In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import cs_detection

import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

Ground predictions

PVLib Clearsky

Only making ground predictions using PVLib clearsky model and statistical model. NSRDB model won't be available to ground measurements.

In [2]:
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [3]:
len(nsrdb.df)
Out[3]:
315552

Train/test on NSRDB data to find optimal parameters

Default classifier

In [4]:
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [5]:
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [6]:
clf = ensemble.RandomForestClassifier(random_state=42)
In [7]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [8]:
feature_cols = [
    'tfn',
    'abs_ideal_ratio_diff grad',
    'abs_ideal_ratio_diff grad mean', 
    'abs_ideal_ratio_diff grad std',
    'abs_ideal_ratio_diff grad second',
    'abs_ideal_ratio_diff grad second mean',
    'abs_ideal_ratio_diff grad second std',
    'GHI Clearsky GHI pvlib line length ratio',
    'abs_ideal_ratio_diff', 
    'abs_ideal_ratio_diff mean',
    'abs_ideal_ratio_diff std',
    'GHI Clearsky GHI pvlib abs_diff',
    'GHI Clearsky GHI pvlib abs_diff mean', 
    'GHI Clearsky GHI pvlib abs_diff std'
]

target_cols = ['sky_status']
In [9]:
vis = visualize.Visualizer()
vis.plot_corr_matrix(train.df[feature_cols].corr(), feature_cols)
/Users/benellis/miniconda3/lib/python3.5/site-packages/seaborn/palettes.py:727: DeprecationWarning:

object of type <class 'float'> cannot be safely interpreted as an integer.

/Users/benellis/miniconda3/lib/python3.5/site-packages/seaborn/palettes.py:727: DeprecationWarning:

object of type <class 'float'> cannot be safely interpreted as an integer.

In [17]:
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [11]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[11]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
In [12]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

In [13]:
metrics.accuracy_score(test.df['sky_status'], pred)
Out[13]:
0.94417808219178079
In [14]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [15]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [16]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Gridsearch

In [18]:
import warnings
In [19]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    params={}
    params['max_depth'] = [4, 8, 12, 16]
    params['n_estimators'] = [64]
    params['class_weight'] = [None, 'balanced']
    params['min_samples_leaf'] = [1, 2, 3]
    results = []
    for depth, nest, cw, min_samples in itertools.product(params['max_depth'], params['n_estimators'], params['class_weight'], params['min_samples_leaf']):
        print('Params:')
        print('depth: {}, n_estimators: {}, class_weight: {}, min_samples_leaf: {}'.format(depth, nest, cw, min_samples))
        train2 = cs_detection.ClearskyDetection(train.df)
        train2.trim_dates('01-01-1999', '01-01-2014')
        utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
        test2 = cs_detection.ClearskyDetection(train.df)
        test2.trim_dates('01-01-2014', '01-01-2015')
        clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest, class_weight=cw, min_samples_leaf=min_samples, n_jobs=-1, random_state=42)
        clf.fit(train2.df[train2.df['GHI'] > 0][feature_cols].values, train2.df[train2.df['GHI'] > 0][target_cols].values.flatten())
        
        print('\t Scores:')
        test_pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True)
        accuracy_score = metrics.accuracy_score(test2.df['sky_status'], test_pred)
        print('\t\t accuracy: {}'.format(accuracy_score))
        f1_score = metrics.f1_score(test2.df['sky_status'], test_pred)
        print('\t\t f1:{}'.format(f1_score))
        recall_score = metrics.recall_score(test2.df['sky_status'], test_pred)
        print('\t\t recall:{}'.format(recall_score))
        precision_score = metrics.precision_score(test2.df['sky_status'], test_pred)
        print('\t\t precision:{}'.format(precision_score))
        results.append({'max_depth': depth, 'n_estimators': nest, 'class_weight': cw, 'min_samples_leaf': min_samples,
                        'accuracy': accuracy_score, 'f1': f1_score, 'recall': recall_score, 'precision': precision_score})
Params:
depth: 4, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.948458904109589
		 f1:0.9082969432314411
		 recall:0.9303099646349074
		 precision:0.8873015873015873
Params:
depth: 4, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.948458904109589
		 f1:0.9082969432314411
		 recall:0.9303099646349074
		 precision:0.8873015873015873
Params:
depth: 4, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.948458904109589
		 f1:0.9082969432314411
		 recall:0.9303099646349074
		 precision:0.8873015873015873
Params:
depth: 4, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9480022831050229
		 f1:0.9072206945717487
		 recall:0.9265654254212606
		 precision:0.8886671987230647
Params:
depth: 4, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9480022831050229
		 f1:0.9072206945717487
		 recall:0.9265654254212606
		 precision:0.8886671987230647
Params:
depth: 4, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9480022831050229
		 f1:0.9072206945717487
		 recall:0.9265654254212606
		 precision:0.8886671987230647
Params:
depth: 8, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9509132420091324
		 f1:0.9134286289510771
		 recall:0.9438319117952986
		 precision:0.8849229568948703
Params:
depth: 8, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9509703196347032
		 f1:0.9135379969803724
		 recall:0.9440399417516122
		 precision:0.8849453978159126
Params:
depth: 8, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9508561643835617
		 f1:0.9133715665559916
		 recall:0.944247971707926
		 precision:0.8844505066250974
Params:
depth: 8, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9505707762557077
		 f1:0.9125429206220965
		 recall:0.939879342625338
		 precision:0.8867517173699706
Params:
depth: 8, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9508561643835617
		 f1:0.9129687657939957
		 recall:0.9394632827127106
		 precision:0.8879276445143531
Params:
depth: 8, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9509703196347032
		 f1:0.9133286247603671
		 recall:0.9415435822758477
		 precision:0.8867554858934169
Params:
depth: 12, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9504566210045662
		 f1:0.9124823553135714
		 recall:0.941335552319534
		 precision:0.8853453335942085
Params:
depth: 12, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9503995433789955
		 f1:0.9124433249370277
		 recall:0.9419596421884752
		 precision:0.8847205939820242
Params:
depth: 12, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9501712328767123
		 f1:0.9120048382219534
		 recall:0.9411275223632203
		 precision:0.8846304262807978
Params:
depth: 12, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9492579908675799
		 f1:0.910065756196257
		 recall:0.9357187434990638
		 precision:0.8857818038597873
Params:
depth: 12, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9498287671232877
		 f1:0.9110233829334953
		 recall:0.9361348034116913
		 precision:0.887223974763407
Params:
depth: 12, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9493721461187214
		 f1:0.9103043786024877
		 recall:0.936342833368005
		 precision:0.8856749311294766
Params:
depth: 16, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9485730593607306
		 f1:0.9089990910009089
		 recall:0.9361348034116913
		 precision:0.8833922261484098
Params:
depth: 16, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9489155251141552
		 f1:0.9093487288564772
		 recall:0.9338464738922405
		 precision:0.8861034346624556
Params:
depth: 16, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9488013698630137
		 f1:0.9093481556341586
		 recall:0.9359267734553776
		 precision:0.8842374213836478
Params:
depth: 16, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9477168949771689
		 f1:0.9074373484236055
		 recall:0.9340545038485542
		 precision:0.8822951463941835
Params:
depth: 16, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9488584474885845
		 f1:0.9092750101255569
		 recall:0.9340545038485542
		 precision:0.8857762872361412
Params:
depth: 16, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9491438356164383
		 f1:0.9096623745310758
		 recall:0.9332223840232994
		 precision:0.8872626582278481
In [20]:
runs_df = pd.DataFrame(results)
In [21]:
runs_df.to_csv('8_abq_non_directional_features.csv')
In [22]:
runs_df[['accuracy', 'f1', 'recall', 'precision']].iplot(kind='box')
In [23]:
runs_df
Out[23]:
accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
0 0.948459 None 0.908297 4 1 64 0.887302 0.930310
1 0.948459 None 0.908297 4 2 64 0.887302 0.930310
2 0.948459 None 0.908297 4 3 64 0.887302 0.930310
3 0.948002 balanced 0.907221 4 1 64 0.888667 0.926565
4 0.948002 balanced 0.907221 4 2 64 0.888667 0.926565
5 0.948002 balanced 0.907221 4 3 64 0.888667 0.926565
6 0.950913 None 0.913429 8 1 64 0.884923 0.943832
7 0.950970 None 0.913538 8 2 64 0.884945 0.944040
8 0.950856 None 0.913372 8 3 64 0.884451 0.944248
9 0.950571 balanced 0.912543 8 1 64 0.886752 0.939879
10 0.950856 balanced 0.912969 8 2 64 0.887928 0.939463
11 0.950970 balanced 0.913329 8 3 64 0.886755 0.941544
12 0.950457 None 0.912482 12 1 64 0.885345 0.941336
13 0.950400 None 0.912443 12 2 64 0.884721 0.941960
14 0.950171 None 0.912005 12 3 64 0.884630 0.941128
15 0.949258 balanced 0.910066 12 1 64 0.885782 0.935719
16 0.949829 balanced 0.911023 12 2 64 0.887224 0.936135
17 0.949372 balanced 0.910304 12 3 64 0.885675 0.936343
18 0.948573 None 0.908999 16 1 64 0.883392 0.936135
19 0.948916 None 0.909349 16 2 64 0.886103 0.933846
20 0.948801 None 0.909348 16 3 64 0.884237 0.935927
21 0.947717 balanced 0.907437 16 1 64 0.882295 0.934055
22 0.948858 balanced 0.909275 16 2 64 0.885776 0.934055
23 0.949144 balanced 0.909662 16 3 64 0.887263 0.933222

Best recall model

In [24]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [25]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [26]:
best_recall = runs_df.iloc[runs_df['recall'].idxmax()]
In [27]:
params_recall = best_recall[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [28]:
params_recall
Out[28]:
{'class_weight': None,
 'max_depth': 8,
 'min_samples_leaf': 3,
 'n_estimators': 64}
In [29]:
clf = ensemble.RandomForestClassifier(**params_recall, n_jobs=-1)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[29]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=64, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [30]:
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
In [31]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [32]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [33]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best accuracy model

In [34]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [35]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [36]:
best_accuracy = runs_df.iloc[runs_df['accuracy'].idxmax()]
In [37]:
print(best_accuracy.equals(best_recall))
False
In [38]:
params_accuracy = best_accuracy[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [39]:
params_accuracy
Out[39]:
{'class_weight': None,
 'max_depth': 8,
 'min_samples_leaf': 2,
 'n_estimators': 64}
In [40]:
clf = ensemble.RandomForestClassifier(**params_accuracy, n_jobs=-1)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[40]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=64, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [41]:
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
In [42]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [43]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [44]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best precision model

In [45]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [46]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [47]:
best_precision = runs_df.iloc[runs_df['precision'].idxmax()]
In [48]:
print(best_precision.equals(best_recall))
print(best_precision.equals(best_accuracy))
False
False
In [49]:
params_precision = best_precision[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [50]:
params_precision
Out[50]:
{'class_weight': 'balanced',
 'max_depth': 4,
 'min_samples_leaf': 1,
 'n_estimators': 64}
In [51]:
clf = ensemble.RandomForestClassifier(**params_precision, n_jobs=-1)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[51]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [52]:
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
In [53]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [54]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [55]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best f1 model

In [56]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [57]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [58]:
best_f1 = runs_df.iloc[runs_df['f1'].idxmax()]
In [59]:
print(best_f1.equals(best_recall))
print(best_f1.equals(best_accuracy))
print(best_f1.equals(best_precision))
False
True
False

Same model as best accuracy - scroll up.

Train on all NSRDB data, test various freq of ground data

In [60]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf = ensemble.RandomForestClassifier(**best_precision[['max_depth', 'class_weight', 'min_samples_leaf']].to_dict(), n_estimators=100)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[60]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [61]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

30 min freq ground data

In [62]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [63]:
test.trim_dates('10-01-2015', '11-01-2015')
In [64]:
test.df = test.df[test.df.index.minute % 30 == 0]
In [65]:
test.df.keys()
Out[65]:
Index(['GHI', 'Clearsky GHI pvlib', 'Clearsky GHI stat', 'sky_status pvlib',
       'Clearsky GHI stat smooth', 'ghi_status', 'scale'],
      dtype='object')
In [66]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [67]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [68]:
train2 = cs_detection.ClearskyDetection(nsrdb.df)
train2.intersection(test.df.index)
In [69]:
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [70]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

15 min freq ground data

In [71]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [72]:
test.trim_dates('10-01-2015', '10-17-2015')
In [73]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [74]:
test.df = test.df[test.df.index.minute % 15 == 0]
# test.df = test.df.resample('15T').apply(lambda x: x[len(x) // 2])
In [75]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [76]:
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='15min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [77]:
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [78]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

10 min freq ground data

In [79]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [80]:
test.trim_dates('10-01-2015', '10-08-2015')
In [81]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [82]:
test.df = test.df[test.df.index.minute % 10 == 0]
In [83]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [84]:
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-08-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='10min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [85]:
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [86]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]

visualize.plot_ts_slider_highligther(test.df, prob='probas')

5 min freq ground data

In [87]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [88]:
test.trim_dates('10-01-2015', '10-04-2015')
In [89]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [90]:
test.df = test.df[test.df.index.minute % 5 == 0]
In [91]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13, multiproc=True, by_day=True).astype(bool)
In [92]:
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='5min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [93]:
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
/Users/benellis/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py:6: UserWarning:

Boolean Series key will be reindexed to match DataFrame index.

/Users/benellis/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py:7: UserWarning:

Boolean Series key will be reindexed to match DataFrame index.

/Users/benellis/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py:8: UserWarning:

Boolean Series key will be reindexed to match DataFrame index.

In [94]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]

visualize.plot_ts_slider_highligther(test.df, prob='probas')

1 min freq ground data

In [95]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [96]:
test.trim_dates('10-01-2015', '10-08-2015')
In [97]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [98]:
test.df = test.df[test.df.index.minute % 1 == 0]
In [99]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

In [100]:
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-08-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='1min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [101]:
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [102]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

Save model

In [ ]:
import pickle
In [ ]:
with open('8_abq_direction_features_model.pkl', 'wb') as f:
    pickle.dump(clf, f)
In [ ]:
!ls *abq*

Conclusion

In general, the clear sky identification looks good. At lower frequencies (30 min, 15 min) we see good agreement with NSRDB labeled points. I suspect this could be further improved my doing a larger hyperparameter search, or even doing some feature extraction/reduction/additions.

In [ ]: